library(Rcpp)
# Define the Rational Class
setClass(
"Rational",
slots = c(numerator = "numeric", denominator = "numeric"),
validity = function(object) {
if (object@denominator == 0) {
stop("Denominator cannot be 0.")
}
TRUE
}
)Statistics 506- Problem Set #5
Link to GitHub
github: https://github.com/garrettpinkston2015/Computational-Methods
Problem 1 - OOP Programming
Create a class to represent rational numbers (numbers of the form for integers \(a\) and \(b\). Do this using S4.
a) For the rational class, define the following:
- A validator that ensures the denominator is non-zero. (Defined above in class)
# Constructor for the Rational Class
Rational <- function(numerator, denominator = 1) {
new("Rational", numerator = numerator, denominator = denominator)
}- A show method.
# Show Method
setMethod(
"show",
"Rational",
function(object) {
cat(sprintf("%d/%d\n", object@numerator, object@denominator))
}
)- A simplify method, to obtain the simplest form (e.g. simplify(2/4) produces 1/2).
# Simplify Method
setGeneric("simplify", function(object) standardGeneric("simplify"))[1] "simplify"
setMethod(
"simplify",
"Rational",
function(object) {
gcd <- compute_gcd(object@numerator, object@denominator)
object@numerator <- object@numerator / gcd
object@denominator <- object@denominator / gcd
return(object)
}
)- A quotient method (e.g. quotient(3/7) produces .42857143…). It should support a digits argument but only in the printing, not the returned result (Hint: what does print return?).
# Quotient Method
setGeneric("quotient", function(object, digits = 7) standardGeneric("quotient"))[1] "quotient"
setMethod(
"quotient",
"Rational",
function(object, digits = 7) {
if (!is.numeric(digits) || digits != as.integer(digits) || digits < 0) {
stop("Digits must be a non-negative integer.")
}
result <- object@numerator / object@denominator
cat(round(result, digits), "\n")
return(result)
}
)- Addition, subtraction, multiplication, division. These should all return a rational.
# Arithmetic Methods (+, -, *, /)
setMethod(
"+",
c("Rational", "Rational"),
function(e1, e2) {
lcm_den <- compute_lcm(e1@denominator, e2@denominator)
new_num <- (e1@numerator * (lcm_den / e1@denominator)) +
(e2@numerator * (lcm_den / e2@denominator))
return(simplify(Rational(new_num, lcm_den)))
}
)
setMethod(
"-",
c("Rational", "Rational"),
function(e1, e2) {
lcm_den <- compute_lcm(e1@denominator, e2@denominator)
new_num <- (e1@numerator * (lcm_den / e1@denominator)) -
(e2@numerator * (lcm_den / e2@denominator))
return(simplify(Rational(new_num, lcm_den)))
}
)
setMethod(
"*",
c("Rational", "Rational"),
function(e1, e2) {
return(simplify(Rational(e1@numerator * e2@numerator, e1@denominator * e2@denominator)))
}
)
setMethod(
"/",
c("Rational", "Rational"),
function(e1, e2) {
if (e2@numerator == 0) {
stop("Division by zero is not allowed.")
}
return(simplify(Rational(e1@numerator * e2@denominator, e1@denominator * e2@numerator)))
}
)- You’ll (probably) need GCD and LCM as part of some of these calculations; include these functions using Rcpp. Even if you don’t need these functions for another calculation, include them.
cppFunction("
#include <numeric>`
int compute_gcd(int a, int b) {
return std::gcd(a, b);
}")
cppFunction("
#include <numeric>
int compute_lcm(int a, int b) {
return std::lcm(a, b);
}")b) Use your rational class to create three objects:
r1 <- Rational(24, 6)
r2 <- Rational(7, 230)
r3 <- Rational(0, 4)Evaluate the following code (remember you can tell Quarto not to stop on errors):
r124/6
r30/4
r1 + r2927/230
r1 - r2913/230
r1 * r214/115
r1 / r2920/7
r1 + r34/1
r1 * r30/1
r2 / r3Error in r2/r3: Division by zero is not allowed.
quotient(r1)4
[1] 4
quotient(r2)0.0304348
[1] 0.03043478
quotient(r2, digits = 3)0.03
[1] 0.03043478
quotient(r2, digits = 3.14)Error in quotient(r2, digits = 3.14): Digits must be a non-negative integer.
quotient(r2, digits = "avocado")Error in quotient(r2, digits = "avocado"): Digits must be a non-negative integer.
q2 <- quotient(r2, digits = 3)0.03
q2[1] 0.03043478
quotient(r3)0
[1] 0
simplify(r1)4/1
simplify(r2)7/230
simplify(r3)0/1
c) Show that your validator does not allow the creation of rational’s with 0 denominator, and check other malformed input to your constructor.
# test validator and constructor with invalid inputs
# create a rational number with a zero denominator
tryCatch({
r_invalid <- Rational(1, 0)
}, error = function(e) {
cat("Error:", e$message, "\n")
})Error: Denominator cannot be 0.
# create a rational number with a non-numeric numerator
tryCatch({
r_invalid <- Rational("not_a_number", 2)
}, error = function(e) {
cat("Error:", e$message, "\n")
})Error: invalid class "Rational" object: invalid object for slot "numerator" in class "Rational": got class "character", should be or extend class "numeric"
# create a rational number with a non-numeric denominator
tryCatch({
r_invalid <- Rational(3, "not_a_number")
}, error = function(e) {
cat("Error:", e$message, "\n")
})Error: invalid class "Rational" object: invalid object for slot "denominator" in class "Rational": got class "character", should be or extend class "numeric"
# create a rational number with missing arguments
tryCatch({
r_invalid <- Rational()
}, error = function(e) {
cat("Error:", e$message, "\n")
})Error: argument "numerator" is missing, with no default
# create a rational number with only one argument (should default denominator to 1)
tryCatch({
r_valid <- Rational(5)
r_valid
}, error = function(e) {
cat("Error:", e$message, "\n")
})5/1
Problem 2 - plotly
# read in art data
art <- read.csv("/Users/garrettpinkston/Desktop/Michigan/STAT506/Data/df_for_ml_improved_new_market.csv")Does the distribution of genre of sales across years appear to change?
# load in tidyr for data manipulation
library(tidyr)
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
# reshape the 'art' dataset to a long format
df_long <- art %>%
# used pivot_longer to transform all columns starting with "Genre___" into
# key-value pairs
pivot_longer(cols = starts_with("Genre___"),
names_to = "genre", # New column for genre names
values_to = "count") %>% # New column for genre values
# Filter when presence of a specific genre is in a sale
filter(count == 1)
# visualize the yearly genre distribution
df_long %>%
# group data by both year and genre
group_by(year, genre) %>%
# count occurrences for each genre per year
summarize(count = n()) %>%
# group data by year to compute the proportion of each genre
group_by(year) %>%
# calculate proportion of each genre's count relative to all other counts
mutate(proportion = count / sum(count)) %>%
# use ggplot, setting 'year' as x-axis and 'proportion' as y-axis
ggplot(aes(x = year, y = proportion, fill = genre)) +
# using a stacked bar plot to show proportions per genre each year
geom_bar(stat = "identity", position = "fill") +
# adding title, x-axis, y-axis, and legend labels
labs(title = "Distribution of Genre Sales Across Years",
x = "Year",
y = "Proportion of Sales",
fill = "Genre") +
# center the plot title
theme(plot.title = element_text(hjust = 0.5))`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.
# load libraries
library(dplyr)
library(tidyr)
library(plotly)
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
# reshape data to include genres and calculate yearly averages
df_genre_price <- art %>%
pivot_longer(cols = starts_with("Genre___"),
names_to = "genre",
values_to = "count") %>%
filter(count == 1) %>% # Keep rows where the genre is present
mutate(genre = str_remove(genre, "^Genre___")) %>% # Clean genre names
group_by(year, genre) %>%
summarize(avg_price = mean(price_usd, na.rm = TRUE), .groups = "drop")
# find overall average sales price per year
df_overall_price <- art %>%
group_by(year) %>%
summarize(avg_price = mean(price_usd, na.rm = TRUE), .groups = "drop") %>%
mutate(genre = "Overall")
# tally overall and genre-specific data
df_combined <- bind_rows(df_genre_price, df_overall_price)
# create an interactive plot
plotly_plot <- plot_ly(df_combined,
x = ~year,
y = ~avg_price,
color = ~genre,
type = 'scatter',
mode = 'lines+markers',
line = list(width = 2),
marker = list(size = 6)) %>%
layout(title = "Change in Sales Price Over Time by Genre",
xaxis = list(title = "Year"),
yaxis = list(title = "Average Sales Price (USD)"),
legend = list(title = list(text = "Genre")),
hovermode = "x unified")
# display plot
plotly_plotProblem 3 - data.table
Install and load the package nycflights13.
# load in data
library(nycflights13)a) Generate a table (which can just be a nicely printed tibble) reporting the mean and median departure delay per airport. Generate a second table (which again can be a nicely printed tibble) reporting the mean and median arrival delay per airport. Exclude any destination with under 10 flights. Do this exclusion through code, not manually.
Additionally,
- Order both tables in descending mean delay.
- Both tables should use the airport names not the airport codes.
- Both tables should print all rows.
# load the necessary libraries for data manipulation
library(nycflights13) # contains flight data
library(data.table) # for efficient data manipulation
Attaching package: 'data.table'
The following objects are masked from 'package:dplyr':
between, first, last
# convert the flights and airports data to data.table format
flights_dt <- as.data.table(flights)
airports_dt <- as.data.table(airports)
# part a: compute mean and median departure delays for airports with 10 or more flights
dep_stats <- flights_dt[
# count the number of flights per destination airport
, .(count = .N), by = .(dest)
][
# filter to include only airports with at least 10 flights
count >= 10
][
# join back to the flights data on destination airport
flights_dt, on = "dest"
][
# calculate the mean and median departure delays per destination
, .(
mean_dep_delay = mean(dep_delay, na.rm = TRUE), # average departure delay
median_dep_delay = median(dep_delay, na.rm = TRUE) # median departure delay
), by = .(dest)
][
# join with the airports data to get airport names
airports_dt, on = .(dest = faa)
][
# sort the results by mean departure delay in descending order
order(-mean_dep_delay)
][
# select only relevant columns: airport name, mean, and median delays
, .(name, mean_dep_delay, median_dep_delay)
]
# part a: Compute mean and median arrival delays for airports with 10 or more flights
arr_stats <- flights_dt[
# count the number of flights per destination airport
, .(count = .N), by = .(dest)
][
# filter to include only airports with at least 10 flights
count >= 10
][
# join back to the flights data on destination airport
flights_dt, on = "dest"
][
# calculate the mean and median arrival delays per destination
, .(
mean_arr_delay = mean(arr_delay, na.rm = TRUE), # Average arrival delay
median_arr_delay = median(arr_delay, na.rm = TRUE) # Median arrival delay
), by = .(dest)
][
# join with the airports data to get airport names
airports_dt, on = .(dest = faa)
][
# sort the results by mean arrival delay in descending order
order(-mean_arr_delay)
][
# select only relevant columns: airport name, mean, and median delays
, .(name, mean_arr_delay, median_arr_delay)
]
# print the results for departure and arrival delay statistics
print(dep_stats) name mean_dep_delay median_dep_delay
<char> <num> <num>
1: Columbia Metropolitan 35.57009 14
2: Tulsa Intl 34.90635 8
3: Will Rogers World 30.56881 10
4: Birmingham Intl 29.69485 1
5: Mc Ghee Tyson 28.49396 0
---
1454: Black Rock NA NA
1455: New Haven Rail Station NA NA
1456: Wilmington Amtrak Station NA NA
1457: Washington Union Station NA NA
1458: Penn Station NA NA
print(arr_stats) name mean_arr_delay median_arr_delay
<char> <num> <num>
1: Columbia Metropolitan 41.76415 28
2: Tulsa Intl 33.65986 14
3: Will Rogers World 30.61905 16
4: Jackson Hole Airport 28.09524 15
5: Mc Ghee Tyson 24.06920 2
---
1454: Black Rock NA NA
1455: New Haven Rail Station NA NA
1456: Wilmington Amtrak Station NA NA
1457: Washington Union Station NA NA
1458: Penn Station NA NA
b) How many flights did the aircraft model with the fastest average speed take? Produce a tibble with 1 row, and entires for the model, average speed (in MPH) and number of flights.
# load necessary libraries
library(nycflights13)
library(data.table)
# convert flights and planes datasets to data.table
flights_dt <- as.data.table(flights)
planes_dt <- as.data.table(planes)
# join flights with planes to include model information
flights_planes_dt <- flights_dt[
!is.na(air_time) & !is.na(distance), # exclude rows with missing air_time or distance
][planes_dt, on = "tailnum"]
# calculate speed and aggregate by model
model_speeds <- flights_planes_dt[
, .(
avg_speed_mph = mean(distance / (air_time / 60), na.rm = TRUE), # calculate average speed
num_flights = .N # count number of flights
), by = model
]
# find the model with the fastest average speed
fastest_model <- model_speeds[
avg_speed_mph == max(avg_speed_mph, na.rm = TRUE)
]
# print the result
print(fastest_model) model avg_speed_mph num_flights
<char> <num> <int>
1: 777-222 482.6254 4